package cn.keyvalue.ccf.base;

import java.io.IOException;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Minimal page crawler: downloads a single page with jsoup when constructed
 * and exposes its title, meta description, meta keywords and body HTML.
 * Every text accessor returns a trimmed, non-null string.
 */
public class BaseCrawler {

	/** Timeout handed to the jsoup connection, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 30000;

	/** Attribute of a {@code <meta>} tag that carries its value. */
	private static final String CONTENT_ATTR = "content";

	/** Parsed document of the fetched page. */
	private final Document doc;

	/** Address the page was fetched from. */
	private final String url;

	/**
	 * Fetches and parses the page at the given address.
	 *
	 * @param url address of the page to download
	 * @throws IOException if the page cannot be retrieved
	 */
	public BaseCrawler( String url ) throws IOException
	{
		this.url = url;
		this.doc = Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
	}

	/**
	 * @return the address this crawler was created with
	 */
	public String getUrl()
	{
		return url;
	}

	/**
	 * Reads the page title.
	 *
	 * @return the trimmed title, or an empty string when there is none
	 */
	public String getTitle()
	{
		return StringUtils.trimToEmpty(doc.title());
	}

	/**
	 * Reads the page description from {@code <meta name="description">}.
	 *
	 * @return the trimmed description, or an empty string when the tag is absent
	 */
	public String getDescription()
	{
		return metaContent("description");
	}

	/**
	 * Reads the page keywords from {@code <meta name="keywords">}.
	 *
	 * @return the trimmed keywords, or an empty string when the tag is absent
	 */
	public String getKeywords()
	{
		return metaContent("keywords");
	}

	/**
	 * Returns the inner HTML of the {@code <body>} element.
	 * Subclasses are generally expected to override this method; the whole
	 * body is only the default.
	 *
	 * @return the trimmed body HTML, or an empty string when there is none
	 */
	public String getContent()
	{
		Elements body = doc.getElementsByTag("body");
		return StringUtils.trimToEmpty(body.html());
	}

	/**
	 * Looks up the {@code content} attribute of the first
	 * {@code <meta name="...">} tag with the given name.
	 *
	 * @param metaName value of the {@code name} attribute to match
	 * @return the trimmed attribute value, or an empty string when no tag matches
	 */
	private String metaContent( String metaName )
	{
		// Query once and reuse the result for both the emptiness check and the read.
		Elements matches = doc.select("meta[name=" + metaName + "]");
		if( matches.isEmpty() )
		{
			return "";
		}

		return StringUtils.trimToEmpty(matches.get(0).attr(CONTENT_ATTR));
	}

	/**
	 * Manual smoke test: crawls a sample page and prints the extracted fields.
	 *
	 * @param args unused
	 * @throws IOException if the sample page cannot be retrieved
	 */
	public static void main( String[] args ) throws IOException
	{
		String url = "http://www.smzdm.com/youhui/253907";
		BaseCrawler crawler = new BaseCrawler(url);

		System.out.println( crawler.getTitle() );
		System.out.println( crawler.getKeywords() );
		System.out.println( crawler.getDescription() );
		System.out.println( crawler.getContent() );
	}
}